Compare splice junctions against Baruzzo ground truth in human T1, T2, T3¶

In [1]:
import os
import subprocess
from concurrent.futures import ThreadPoolExecutor

sample_ids = [
    "baruzzo_human_t1r1", "baruzzo_human_t2r1", "baruzzo_human_t3r1",
    "baruzzo_malaria_t1r1", "baruzzo_malaria_t2r1", "baruzzo_malaria_t3r1"
]

def filter_bam(input_bam, output_bam, mapq_threshold=10):
    """Filters BAM file to retain only reads with MAPQ >= specified threshold."""
    command = ["samtools", "view", "-b", "-q", str(mapq_threshold), "-o", output_bam, input_bam]
    # print(f"Filtering: {' '.join(command)}")
    subprocess.run(command, check=True)


def parallel_filtering(aligner_list, sample_id, mapq_threshold=10, num_threads=4):
    """Filters BAM files in parallel using ThreadPoolExecutor."""
    filtered_bam_dir = "/data/filtered_bams"
    os.makedirs(filtered_bam_dir, exist_ok=True)

    filtered_aligner_list = []
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        future_to_bam = {
            executor.submit(
                filter_bam, bam_file, os.path.join(filtered_bam_dir, f"{aligner}__{sample_id}.MAPQ{mapq_threshold}.bam"), mapq_threshold
            ): (aligner, bam_file)
            for aligner, bam_file in aligner_list
        }

        for future in future_to_bam:
            aligner, bam_file = future_to_bam[future]
            try:
                future.result()  # Wait for the filtering process to complete
                filtered_aligner_list.append([
                    aligner,
                    os.path.join(filtered_bam_dir, f"{aligner}__{sample_id}.MAPQ{mapq_threshold}.bam")
                ])
            except Exception as e:
                print(f"Error filtering {bam_file}: {e}")

    return filtered_aligner_list

def run_featurecounts(aligner_list, genome_file, annotation_file, output_file, output_log, extra_flags, threads=8):
    """
    Run featureCounts on a list of BAM files.
    """
    bam_files = [bam[1] for bam in aligner_list]
    
    command = [
        '/home/fberakdar/anaconda3/envs/env_Benchmark_subread/bin/featureCounts',
        '-T', str(threads),
        '-G', genome_file,
        '-a', annotation_file,
        '-o', output_file
    ] + extra_flags + bam_files

    print("Running command: {}".format(" ".join(command)))
    with open(output_log, "w") as output_log_file:
        subprocess.run(command, check=True, stderr=output_log_file, stdout=output_log_file)

for MAPQ in ["10", "0"]:
    if MAPQ == "10":
        extra_flags = ["-J", "-O", "-Q", MAPQ, "-t", "exon", "-g", "gene_id", "-p", "--primary", "--maxMOp", "16"]
    else:
        extra_flags = ["-J", "-O", "-Q", MAPQ, "-t", "exon", "-g", "gene_id", "-p", "-M", "--maxMOp", "16"]
    
    for sample_id in sample_ids:
        print(f"Processing {sample_id} with MAPQ {MAPQ}")
        aligner_list = [
            ["DeepSAP", f"/data/RNAseqBenchmark/nextflow_May2024/cheesy_cajal/alignment/deepsap/DeepSAP__v0.0.1__GSNAP_24_6__{sample_id}.sorted.bam"],
            ["DRAGEN", f"/mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_DRAGEN_v4.0.3/scruffy_pesquet/alignment/dragen/DRAGEN__05.121.645.4.0.3__default__{sample_id}.sorted.bam"],
            ["novoSplice", f"/mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/novoSplice/novoSplice__0.8.4__default__{sample_id}.sorted.bam"],
            ["STAR", f"/mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/star/STAR__2.7.10a__default__{sample_id}.sorted.bam"],
            ["HISAT2", f"/mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/hisat2/HISAT2__2.2.1__default__{sample_id}.sorted.bam"],
            ["Subjunc", f"/mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/subjunc/Subjunc__2.0.1__default__{sample_id}.sorted.bam"],
        ]
        
        output_file = f"/data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_{sample_id}_{MAPQ}"
        output_log  = f"/data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_{sample_id}_{MAPQ}.log"

        n_threads = 24
        if "malaria" in sample_id:
            genome_file = "/data/references/PFAL_Baruzzo/genome_sequence_pfal.fa"
            annotation_file = '/data/references/PFAL_Baruzzo/simulator_config_geneinfo_pfal_GTF'
        else:
            genome_file = "/data/references/GRCh37_Baruzzo/GRCh37.primary_assembly.genome.fa"
            annotation_file = '/data/references/GRCh37_Baruzzo/simulator_config_geneinfo_hg19_GTF'

        # Filter BAMs for MAPQ 10 before running featureCounts
        if MAPQ == "10":
            aligner_list = parallel_filtering(aligner_list, sample_id, mapq_threshold=10, num_threads=8)  # Adjust `num_threads` as needed

        run_featurecounts(aligner_list, genome_file, annotation_file, output_file, output_log, extra_flags, n_threads)
Processing baruzzo_human_t1r1 with MAPQ 10
Running command: /home/fberakdar/anaconda3/envs/env_Benchmark_subread/bin/featureCounts -T 24 -G /data/references/GRCh37_Baruzzo/GRCh37.primary_assembly.genome.fa -a /data/references/GRCh37_Baruzzo/simulator_config_geneinfo_hg19_GTF -o /data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_baruzzo_human_t1r1_10 -J -O -Q 10 -t exon -g gene_id -p --primary --maxMOp 16 /data/filtered_bams/DeepSAP__baruzzo_human_t1r1.MAPQ10.bam /data/filtered_bams/DRAGEN__baruzzo_human_t1r1.MAPQ10.bam /data/filtered_bams/novoSplice__baruzzo_human_t1r1.MAPQ10.bam /data/filtered_bams/STAR__baruzzo_human_t1r1.MAPQ10.bam /data/filtered_bams/HISAT2__baruzzo_human_t1r1.MAPQ10.bam /data/filtered_bams/Subjunc__baruzzo_human_t1r1.MAPQ10.bam
Processing baruzzo_human_t2r1 with MAPQ 10
Running command: /home/fberakdar/anaconda3/envs/env_Benchmark_subread/bin/featureCounts -T 24 -G /data/references/GRCh37_Baruzzo/GRCh37.primary_assembly.genome.fa -a /data/references/GRCh37_Baruzzo/simulator_config_geneinfo_hg19_GTF -o /data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_baruzzo_human_t2r1_10 -J -O -Q 10 -t exon -g gene_id -p --primary --maxMOp 16 /data/filtered_bams/DeepSAP__baruzzo_human_t2r1.MAPQ10.bam /data/filtered_bams/DRAGEN__baruzzo_human_t2r1.MAPQ10.bam /data/filtered_bams/novoSplice__baruzzo_human_t2r1.MAPQ10.bam /data/filtered_bams/STAR__baruzzo_human_t2r1.MAPQ10.bam /data/filtered_bams/HISAT2__baruzzo_human_t2r1.MAPQ10.bam /data/filtered_bams/Subjunc__baruzzo_human_t2r1.MAPQ10.bam
Processing baruzzo_human_t3r1 with MAPQ 10
Running command: /home/fberakdar/anaconda3/envs/env_Benchmark_subread/bin/featureCounts -T 24 -G /data/references/GRCh37_Baruzzo/GRCh37.primary_assembly.genome.fa -a /data/references/GRCh37_Baruzzo/simulator_config_geneinfo_hg19_GTF -o /data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_baruzzo_human_t3r1_10 -J -O -Q 10 -t exon -g gene_id -p --primary --maxMOp 16 /data/filtered_bams/DeepSAP__baruzzo_human_t3r1.MAPQ10.bam /data/filtered_bams/DRAGEN__baruzzo_human_t3r1.MAPQ10.bam /data/filtered_bams/novoSplice__baruzzo_human_t3r1.MAPQ10.bam /data/filtered_bams/STAR__baruzzo_human_t3r1.MAPQ10.bam /data/filtered_bams/HISAT2__baruzzo_human_t3r1.MAPQ10.bam /data/filtered_bams/Subjunc__baruzzo_human_t3r1.MAPQ10.bam
Processing baruzzo_malaria_t1r1 with MAPQ 10
Running command: /home/fberakdar/anaconda3/envs/env_Benchmark_subread/bin/featureCounts -T 24 -G /data/references/PFAL_Baruzzo/genome_sequence_pfal.fa -a /data/references/PFAL_Baruzzo/simulator_config_geneinfo_pfal_GTF -o /data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_baruzzo_malaria_t1r1_10 -J -O -Q 10 -t exon -g gene_id -p --primary --maxMOp 16 /data/filtered_bams/DeepSAP__baruzzo_malaria_t1r1.MAPQ10.bam /data/filtered_bams/DRAGEN__baruzzo_malaria_t1r1.MAPQ10.bam /data/filtered_bams/novoSplice__baruzzo_malaria_t1r1.MAPQ10.bam /data/filtered_bams/STAR__baruzzo_malaria_t1r1.MAPQ10.bam /data/filtered_bams/HISAT2__baruzzo_malaria_t1r1.MAPQ10.bam /data/filtered_bams/Subjunc__baruzzo_malaria_t1r1.MAPQ10.bam
Processing baruzzo_malaria_t2r1 with MAPQ 10
Running command: /home/fberakdar/anaconda3/envs/env_Benchmark_subread/bin/featureCounts -T 24 -G /data/references/PFAL_Baruzzo/genome_sequence_pfal.fa -a /data/references/PFAL_Baruzzo/simulator_config_geneinfo_pfal_GTF -o /data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_baruzzo_malaria_t2r1_10 -J -O -Q 10 -t exon -g gene_id -p --primary --maxMOp 16 /data/filtered_bams/DeepSAP__baruzzo_malaria_t2r1.MAPQ10.bam /data/filtered_bams/DRAGEN__baruzzo_malaria_t2r1.MAPQ10.bam /data/filtered_bams/novoSplice__baruzzo_malaria_t2r1.MAPQ10.bam /data/filtered_bams/STAR__baruzzo_malaria_t2r1.MAPQ10.bam /data/filtered_bams/HISAT2__baruzzo_malaria_t2r1.MAPQ10.bam /data/filtered_bams/Subjunc__baruzzo_malaria_t2r1.MAPQ10.bam
Processing baruzzo_malaria_t3r1 with MAPQ 10
Running command: /home/fberakdar/anaconda3/envs/env_Benchmark_subread/bin/featureCounts -T 24 -G /data/references/PFAL_Baruzzo/genome_sequence_pfal.fa -a /data/references/PFAL_Baruzzo/simulator_config_geneinfo_pfal_GTF -o /data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_baruzzo_malaria_t3r1_10 -J -O -Q 10 -t exon -g gene_id -p --primary --maxMOp 16 /data/filtered_bams/DeepSAP__baruzzo_malaria_t3r1.MAPQ10.bam /data/filtered_bams/DRAGEN__baruzzo_malaria_t3r1.MAPQ10.bam /data/filtered_bams/novoSplice__baruzzo_malaria_t3r1.MAPQ10.bam /data/filtered_bams/STAR__baruzzo_malaria_t3r1.MAPQ10.bam /data/filtered_bams/HISAT2__baruzzo_malaria_t3r1.MAPQ10.bam /data/filtered_bams/Subjunc__baruzzo_malaria_t3r1.MAPQ10.bam
Processing baruzzo_human_t1r1 with MAPQ 0
Running command: /home/fberakdar/anaconda3/envs/env_Benchmark_subread/bin/featureCounts -T 24 -G /data/references/GRCh37_Baruzzo/GRCh37.primary_assembly.genome.fa -a /data/references/GRCh37_Baruzzo/simulator_config_geneinfo_hg19_GTF -o /data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_baruzzo_human_t1r1_0 -J -O -Q 0 -t exon -g gene_id -p -M --maxMOp 16 /data/RNAseqBenchmark/nextflow_May2024/cheesy_cajal/alignment/deepsap/DeepSAP__v0.0.1__GSNAP_24_6__baruzzo_human_t1r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_DRAGEN_v4.0.3/scruffy_pesquet/alignment/dragen/DRAGEN__05.121.645.4.0.3__default__baruzzo_human_t1r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/novoSplice/novoSplice__0.8.4__default__baruzzo_human_t1r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/star/STAR__2.7.10a__default__baruzzo_human_t1r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/hisat2/HISAT2__2.2.1__default__baruzzo_human_t1r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/subjunc/Subjunc__2.0.1__default__baruzzo_human_t1r1.sorted.bam
Processing baruzzo_human_t2r1 with MAPQ 0
Running command: /home/fberakdar/anaconda3/envs/env_Benchmark_subread/bin/featureCounts -T 24 -G /data/references/GRCh37_Baruzzo/GRCh37.primary_assembly.genome.fa -a /data/references/GRCh37_Baruzzo/simulator_config_geneinfo_hg19_GTF -o /data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_baruzzo_human_t2r1_0 -J -O -Q 0 -t exon -g gene_id -p -M --maxMOp 16 /data/RNAseqBenchmark/nextflow_May2024/cheesy_cajal/alignment/deepsap/DeepSAP__v0.0.1__GSNAP_24_6__baruzzo_human_t2r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_DRAGEN_v4.0.3/scruffy_pesquet/alignment/dragen/DRAGEN__05.121.645.4.0.3__default__baruzzo_human_t2r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/novoSplice/novoSplice__0.8.4__default__baruzzo_human_t2r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/star/STAR__2.7.10a__default__baruzzo_human_t2r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/hisat2/HISAT2__2.2.1__default__baruzzo_human_t2r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/subjunc/Subjunc__2.0.1__default__baruzzo_human_t2r1.sorted.bam
Processing baruzzo_human_t3r1 with MAPQ 0
Running command: /home/fberakdar/anaconda3/envs/env_Benchmark_subread/bin/featureCounts -T 24 -G /data/references/GRCh37_Baruzzo/GRCh37.primary_assembly.genome.fa -a /data/references/GRCh37_Baruzzo/simulator_config_geneinfo_hg19_GTF -o /data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_baruzzo_human_t3r1_0 -J -O -Q 0 -t exon -g gene_id -p -M --maxMOp 16 /data/RNAseqBenchmark/nextflow_May2024/cheesy_cajal/alignment/deepsap/DeepSAP__v0.0.1__GSNAP_24_6__baruzzo_human_t3r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_DRAGEN_v4.0.3/scruffy_pesquet/alignment/dragen/DRAGEN__05.121.645.4.0.3__default__baruzzo_human_t3r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/novoSplice/novoSplice__0.8.4__default__baruzzo_human_t3r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/star/STAR__2.7.10a__default__baruzzo_human_t3r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/hisat2/HISAT2__2.2.1__default__baruzzo_human_t3r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/subjunc/Subjunc__2.0.1__default__baruzzo_human_t3r1.sorted.bam
Processing baruzzo_malaria_t1r1 with MAPQ 0
Running command: /home/fberakdar/anaconda3/envs/env_Benchmark_subread/bin/featureCounts -T 24 -G /data/references/PFAL_Baruzzo/genome_sequence_pfal.fa -a /data/references/PFAL_Baruzzo/simulator_config_geneinfo_pfal_GTF -o /data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_baruzzo_malaria_t1r1_0 -J -O -Q 0 -t exon -g gene_id -p -M --maxMOp 16 /data/RNAseqBenchmark/nextflow_May2024/cheesy_cajal/alignment/deepsap/DeepSAP__v0.0.1__GSNAP_24_6__baruzzo_malaria_t1r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_DRAGEN_v4.0.3/scruffy_pesquet/alignment/dragen/DRAGEN__05.121.645.4.0.3__default__baruzzo_malaria_t1r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/novoSplice/novoSplice__0.8.4__default__baruzzo_malaria_t1r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/star/STAR__2.7.10a__default__baruzzo_malaria_t1r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/hisat2/HISAT2__2.2.1__default__baruzzo_malaria_t1r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/subjunc/Subjunc__2.0.1__default__baruzzo_malaria_t1r1.sorted.bam
Processing baruzzo_malaria_t2r1 with MAPQ 0
Running command: /home/fberakdar/anaconda3/envs/env_Benchmark_subread/bin/featureCounts -T 24 -G /data/references/PFAL_Baruzzo/genome_sequence_pfal.fa -a /data/references/PFAL_Baruzzo/simulator_config_geneinfo_pfal_GTF -o /data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_baruzzo_malaria_t2r1_0 -J -O -Q 0 -t exon -g gene_id -p -M --maxMOp 16 /data/RNAseqBenchmark/nextflow_May2024/cheesy_cajal/alignment/deepsap/DeepSAP__v0.0.1__GSNAP_24_6__baruzzo_malaria_t2r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_DRAGEN_v4.0.3/scruffy_pesquet/alignment/dragen/DRAGEN__05.121.645.4.0.3__default__baruzzo_malaria_t2r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/novoSplice/novoSplice__0.8.4__default__baruzzo_malaria_t2r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/star/STAR__2.7.10a__default__baruzzo_malaria_t2r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/hisat2/HISAT2__2.2.1__default__baruzzo_malaria_t2r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/subjunc/Subjunc__2.0.1__default__baruzzo_malaria_t2r1.sorted.bam
Processing baruzzo_malaria_t3r1 with MAPQ 0
Running command: /home/fberakdar/anaconda3/envs/env_Benchmark_subread/bin/featureCounts -T 24 -G /data/references/PFAL_Baruzzo/genome_sequence_pfal.fa -a /data/references/PFAL_Baruzzo/simulator_config_geneinfo_pfal_GTF -o /data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_baruzzo_malaria_t3r1_0 -J -O -Q 0 -t exon -g gene_id -p -M --maxMOp 16 /data/RNAseqBenchmark/nextflow_May2024/cheesy_cajal/alignment/deepsap/DeepSAP__v0.0.1__GSNAP_24_6__baruzzo_malaria_t3r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_DRAGEN_v4.0.3/scruffy_pesquet/alignment/dragen/DRAGEN__05.121.645.4.0.3__default__baruzzo_malaria_t3r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/novoSplice/novoSplice__0.8.4__default__baruzzo_malaria_t3r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/star/STAR__2.7.10a__default__baruzzo_malaria_t3r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/hisat2/HISAT2__2.2.1__default__baruzzo_malaria_t3r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/subjunc/Subjunc__2.0.1__default__baruzzo_malaria_t3r1.sorted.bam
In [1]:
from pdf2image import convert_from_path
from IPython.display import display, Image
from utility_sj import plot_grid_for_aligners_baruzzo, parse_featurecounts_junctions, plot_grid_for_aligners_baruzzo, process_featurecounts_and_novel_junctions, plot_grid_for_aligners, extract_cig_junctions, extract_cig_junctions, process_featurecounts_baruzzo , extract_custom_gtf_junctions
import pandas as pd 
from scipy.stats import pearsonr, spearmanr, kendalltau
from sklearn.linear_model import TheilSenRegressor
import numpy as np

aligners = ['DeepSAP', 'DRAGEN', 'novoSplice', 'STAR', 'HISAT2', 'Subjunc']
sample_ids = ["baruzzo_human_t1r1", "baruzzo_human_t2r1", "baruzzo_human_t3r1", "baruzzo_malaria_t1r1", "baruzzo_malaria_t2r1", "baruzzo_malaria_t3r1",]

sample_ids = [ "baruzzo_human_t1r1", "baruzzo_human_t2r1", "baruzzo_human_t3r1"]

for sample_id  in sample_ids: 
    baruzzo_ground_truth = f'/data/RNAseqBenchmark/nextflow_May2024/cheesy_cajal/reads/{sample_id}.cig'
    # baruzzo_junctions_df = extract_cig_junctions(baruzzo_ground_truth)
    # baruzzo_junctions_df.to_csv(f'/data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/ground_truth/{sample_id}_junctions_df_.csv', index=False)
    baruzzo_junctions_df = pd.read_csv(f'/data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/ground_truth/{sample_id}_junctions_df_.csv')

    for MAPQ in ["0"]:
        print(f"\nGenerating plot for {sample_id} and MAPQ {MAPQ}")
        featurecounts_file = f"/data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_{sample_id}_{MAPQ}.jcounts"
        counted_junctions_df = process_featurecounts_baruzzo(featurecounts_file)
        counted_junctions_df.to_csv(f'plots/featureCounts/processed_counts/sj_counts_{sample_id}_{MAPQ}.csv', index=False)
        
        # counted_junctions_df = pd.read_csv(f'plots/featureCounts/processed_counts/sj_counts_{sample_id}_{MAPQ}.csv')
        
        matching_columns = ['Chromosome', 'Exon1_End', 'Exon2_Start']
        merged_df = pd.merge(baruzzo_junctions_df, counted_junctions_df, on=matching_columns, how='outer')
        merged_df = merged_df.fillna(0)
        
        # Compute Pearson correlation and p-values
        aligners = ['DeepSAP', 'DRAGEN', 'novoSplice', 'STAR', 'HISAT2', 'Subjunc']
        correlation_results = []

        pdf_path = f'plots/featureCounts/figures/aligner_vs_groundtruth_{sample_id}_MAPQ{MAPQ}.pdf'
        png_path = f'plots/featureCounts/figures/aligner_vs_groundtruth_{sample_id}_MAPQ{MAPQ}.png'

        # Plottting for final pdf 
        plot_grid_for_aligners_baruzzo(merged_df, aligners, pdf_path, log_scale=True)
 
        # Plotting just to show plots
        images = convert_from_path(pdf_path, dpi=300)
        resized_image = images[0].resize((images[0].width // 2, images[0].height // 2))
        resized_image.save(png_path, 'PNG')
        display(Image(png_path))
Generating plot for baruzzo_human_t1r1 and MAPQ 0
DeepSAP: heil-Sen slope= 0.999998, 1.000
DRAGEN: heil-Sen slope= 0.956426, 0.956
novoSplice: heil-Sen slope= 0.978782, 0.979
STAR: heil-Sen slope= 0.957513, 0.958
HISAT2: heil-Sen slope= 0.999989, 1.000
Subjunc: heil-Sen slope= 0.999940, 1.000
No description has been provided for this image
Generating plot for baruzzo_human_t2r1 and MAPQ 0
DeepSAP: heil-Sen slope= 0.999965, 1.000
DRAGEN: heil-Sen slope= 0.945790, 0.946
novoSplice: heil-Sen slope= 0.965219, 0.965
STAR: heil-Sen slope= 0.910179, 0.910
HISAT2: heil-Sen slope= 0.879977, 0.880
Subjunc: heil-Sen slope= 0.948834, 0.949
No description has been provided for this image
Generating plot for baruzzo_human_t3r1 and MAPQ 0
DeepSAP: heil-Sen slope= 0.876132, 0.876
DRAGEN: heil-Sen slope= 0.810139, 0.810
novoSplice: heil-Sen slope= 0.770251, 0.770
STAR: heil-Sen slope= 0.313507, 0.314
HISAT2: heil-Sen slope= 0.000351, 0.000
Subjunc: heil-Sen slope= 0.197540, 0.198
No description has been provided for this image

Compare splice junctions detection of DRAGEN, novoSplice, STAR, HISAT2 and Subjunc against DeepSAP in human T1, T2, T3¶

In [1]:
from pdf2image import convert_from_path
from IPython.display import display, Image

from utility_sj import parse_featurecounts_junctions, detect_novel_junctions, process_featurecounts_and_novel_junctions, plot_grid_for_aligners, extract_cig_junctions, extract_cig_junctions, process_featurecounts_baruzzo , extract_gtf_junctions, extract_custom_gtf_junctions
from utility_sj import plot_grid_for_aligners_2columns_deepSAP_vs_rest

baruzzo_gtf = "/data/references/GRCh37_Baruzzo/simulator_config_geneinfo_hg19_GTF" 
baruzzo_junctions = extract_custom_gtf_junctions(baruzzo_gtf)
orientation = "horizontal"

for MAPQ in ["0"]:
    for T in ["1", "2", "3"]:
        
        featurecounts_file = f"/data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_baruzzo_human_t{T}r1_{MAPQ}.jcounts"
    
        output_pdf = f"/data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/figures_pair/baruzzo_human_t{T}_MAPQ{MAPQ}_sj_pairwise.pdf"
        output_png = f"/data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/figures_pair/baruzzo_human_t{T}_MAPQ{MAPQ}_sj_pairwise.png"

        counted_junctions = parse_featurecounts_junctions(featurecounts_file)

        novel_junctions = counted_junctions - baruzzo_junctions 
        print(f"Processing Baruzzo Human T{T}")
        print("GTF junctions:\t", len(baruzzo_junctions))
        print("FC  junctions:\t", len(counted_junctions))
        print("FC novel junctions:\t", len(novel_junctions))

        df = process_featurecounts_and_novel_junctions(featurecounts_file, novel_junctions)

        aligner_mapping = {
            f'/data/RNAseqBenchmark/nextflow_May2024/cheesy_cajal/alignment/deepsap/DeepSAP__v0.0.1__GSNAP_24_6__baruzzo_human_t{T}r1.sorted.bam': 'DeepSAP',
            f'/mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_DRAGEN_v4.0.3/scruffy_pesquet/alignment/dragen/DRAGEN__05.121.645.4.0.3__default__baruzzo_human_t{T}r1.sorted.bam': 'DRAGEN',
            f'/mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/novoSplice/novoSplice__0.8.4__default__baruzzo_human_t{T}r1.sorted.bam': 'novoSplice',
            f'/mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/star/STAR__2.7.10a__default__baruzzo_human_t{T}r1.sorted.bam': 'STAR',
            f'/mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/hisat2/HISAT2__2.2.1__default__baruzzo_human_t{T}r1.sorted.bam': 'HISAT2',
            f'/mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/subjunc/Subjunc__2.0.1__default__baruzzo_human_t{T}r1.sorted.bam': 'Subjunc'
        }

        # Rename columns in the DataFrame using the aligner_mapping dictionary
        df.rename(columns=aligner_mapping, inplace=True)

        plot_grid_for_aligners_2columns_deepSAP_vs_rest(df, "DeepSAP", [  "DRAGEN","novoSplice", "STAR", "HISAT2", "Subjunc"], output_pdf, log_scale=True, add_constant=1, limit=3, orient=orientation)
        
        images = convert_from_path(output_pdf, dpi=300)  # save image to png then showing it

        resized_image = images[0].resize((images[0].width // 2, images[0].height // 2))

        resized_image.save(output_png, 'PNG')

        display(Image(output_png)) 
Processing Baruzzo Human T1
GTF junctions:	 191293
FC  junctions:	 175019
FC novel junctions:	 36217
No description has been provided for this image
Processing Baruzzo Human T2
GTF junctions:	 191293
FC  junctions:	 195696
FC novel junctions:	 56865
No description has been provided for this image
Processing Baruzzo Human T3
GTF junctions:	 191293
FC  junctions:	 245470
FC novel junctions:	 106359
No description has been provided for this image
In [ ]: